In [1]:
import prody as pr
import py3Dmol
import tempfile
# --- notebook parameters (non-interactive) ---
import os, sys, asyncio
from pathlib import Path

import pandas as pd
from Analysis.PlotUtils import *
from scipy.ndimage import binary_dilation
import matplotlib.patches as mpatches
from IPython.display import display, HTML
from Bio import PDB
from Bio import pairwise2
from Bio.PDB import Superimposer
import py3Dmol
import ipywidgets as widgets
from Bio.PDB import PDBParser
from Bio import pairwise2
import py3Dmol
from Bio import PDB, Align
from Bio.PDB import PDBParser, PDBIO
# from Bio.PDB.Polypeptide import protein_letters_3to1
from io import StringIO
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, BoundaryNorm
from matplotlib.patches import Patch
from config import *
from utils.utils import *
from utils.protein_utils import extract_protein_sequence
from utils.protein_plot_utils import *
CONFIG FOLD-SWITCH PIEPLINE WITH USER: orzuk  ENVIRONMENT: Linux
PyRosetta not installed; skipping energy/ΔG panels
In [2]:
# Silence Windows asyncio warning from zmq/tornado
if sys.platform.startswith("win"):
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Get pair from environment (set by generate_notebooks.py)
fold_pair = os.getenv("PAIR_ID", "").strip()

if not fold_pair:
    # We are likely running interactively in Jupyter UI (not nbconvert).
    # Fall back to input() ONLY when stdin is available.
    try:
        fold_pair = input("Enter fold-switch pair (e.g., 3hdfA_3hdeA): ").strip()
    except Exception:
        raise RuntimeError(
            "PAIR_ID not provided and no stdin available. "
            "When running via generate_notebooks.py, PAIR_ID is set automatically."
        )

# Normalize and split into the two PDB+chain ids
if "_" not in fold_pair:
    raise ValueError(f"PAIR_ID must look like 'A_B', got: {fold_pair}")
pdb1, pdb2 = fold_pair.split("_", 1)

print("Using pair:", pdb1, pdb2)
Using pair: 2mwfA 2nntA
In [3]:
plot_tool = PlotTool(folder=DATA_DIR, fold_pair=fold_pair)
@> 541 atoms and 20 coordinate set(s) were parsed in 0.08s.
@> 1944 atoms and 10 coordinate set(s) were parsed in 0.18s.
In [4]:
VIZ_FOLDER    = f'{DATA_DIR}/{fold_pair}/output_cmap_esm/VizCmaps'

print("Fold-pair,", fold_pair, flush=True)
chains = (pdb1[-1].upper(), pdb2[-1].upper())
print("chains,", chains, flush=True)
pdb_path1 = f"{DATA_DIR}/{fold_pair}/{pdb1[:-1]}.pdb"
pdb_path2 = f"{DATA_DIR}/{fold_pair}/{pdb2[:-1]}.pdb"
#fold_pair_subdir = f"{fold_pair[0]}_{fold_pair[1]}"
Fold-pair, 2mwfA_2nntA
chains, ('A', 'A')
In [5]:
seq_fold1 = extract_protein_sequence(pdb_path1, chain=chains[0], ca_only=False)
seq_fold2 = extract_protein_sequence(pdb_path2, chain=chains[1], ca_only=False)

Visualizations and results¶

Original Structure Visualization¶

In [6]:
pdb_file1 = read_pdb_file(pdb_path1)
plot_tool.plot_single_fold(pdb_file1,label=plot_tool.fold1)

pdb_file2 = read_pdb_file(pdb_path2)
plot_tool.plot_single_fold(pdb_file2,label=plot_tool.fold2)

3Dmol.js failed to load for some reason. Please check your browser console for error messages.

3Dmol.js failed to load for some reason. Please check your browser console for error messages.

In [7]:
try:
    plot_tool.plot_fold_alignement_(pdb_path1,pdb_path1)
except:
    print('Structural Alignment Plot fail !')
@> 541 atoms and 20 coordinate set(s) were parsed in 0.01s.
@> 541 atoms and 20 coordinate set(s) were parsed in 0.01s.
2mwf_TEMP
Blue:2mwf
Red:2mwf

3Dmol.js failed to load for some reason. Please check your browser console for error messages.

AlphaFold predictions¶

Best AF prediction to fold 1¶

In [8]:
from pathlib import Path

df_af = load_csv_or_none(f"{plot_tool.folder}/{fold_pair}/Analysis/df_af.csv")
if df_af is not None:
    # keep your existing filters
    df_af = df_af[df_af.cluster_num != 'Query'].iloc[:, 1:-1]
    print(f"[run] AlphaFold df ready: {len(df_af)} rows")
else:
    print("[skip] AlphaFold section (df_af.csv not available)")
[ok] loaded: /sci/labs/orzuk/orzuk/github/MsaCluster/Pipeline/2mwfA_2nntA/Analysis/df_af.csv  rows=136
[run] AlphaFold df ready: 136 rows
In [9]:
MAX_AF_1, max_af_pdb1 = '', ''
if df_af is not None and not df_af.empty:
    try:
        max_af_pdb1 = df_af[df_af.score_pdb1 > df_af.score_pdb2].sort_values(by='score_pdb1',ascending=False).pdb_file.iloc[0]
        max_af_pdb1 = f'{plot_tool.folder}/{fold_pair}/AF_preds/{max_af_pdb1}'
        try:
            MAX_AF_1 = max_af_pdb1.split('/')[-1][11:14]
        except Exception as e:
            print(e)
        score = df_af[df_af.score_pdb1 > df_af.score_pdb2].sort_values(by='score_pdb1',ascending=False).score_pdb1.iloc[0]
        plot_tool.align_and_visualize_pdb(max_af_pdb1,pdb_path1,score)
    except Exception as e2:
        print(f"[skip] AF fold1 alignment: {e2}")
        try:
            plot_tool.plot_fold_alignement_(max_af_pdb1,pdb_path1)
        except Exception as e3:
            print(f"[skip] AF fold1 fallback visualization: {e3}")
else:
    print("[skip] AF fold1: df_af not available")
[skip] AF fold1 alignment: 'DataFrame' object has no attribute 'score_pdb1'
[skip] AF fold1 fallback visualization:  is not a valid filename or a valid PDB identifier.

Best AF prediction to fold 2¶

In [10]:
MAX_AF_2, max_af_pdb2 = '', ''
if df_af is not None and not df_af.empty:
    try:
        max_af_pdb2 = df_af[df_af.score_pdb1 < df_af.score_pdb2].sort_values(by='score_pdb2',ascending=False).pdb_file.iloc[0]
        max_af_pdb2 = f'{plot_tool.folder}/{fold_pair}/AF_preds/{max_af_pdb2}'
        try:
            MAX_AF_2 = max_af_pdb2.split('/')[-1][11:14]
        except Exception as e:
            print(e)
        score = df_af[df_af.score_pdb1 < df_af.score_pdb2].sort_values(by='score_pdb2',ascending=False).score_pdb2.iloc[0]
        plot_tool.align_and_visualize_pdb(max_af_pdb2,pdb_path2,score)
    except Exception as e2:
        print(f"[skip] AF fold1 alignment: {e2}")
        try:
            plot_tool.plot_fold_alignement_(max_af_pdb2,pdb_path2)
        except Exception as e3:
            print(f"[skip] AF fold1 fallback visualization: {e3}")
else:
    print("[skip] AF fold1: df_af not available")
[skip] AF fold1 alignment: 'DataFrame' object has no attribute 'score_pdb1'
[skip] AF fold1 fallback visualization:  is not a valid filename or a valid PDB identifier.

EsmFold Prediction¶

In [11]:
from pathlib import Path
import os, pandas as pd

# Inputs from the generator (below patch passes these env vars)
ROOT = Path(os.environ.get("MSACLUSTER_ROOT", ".")).resolve()
PAIR_ID = os.environ.get("PAIR_ID", None) or fold_pair  # keep using fold_pair if already defined

PAIR_ANALYSIS = ROOT / "Pipeline" / PAIR_ID / "Analysis"
GLOBAL_RES    = ROOT / "Pipeline_res"

def _read_csv_or_none(p: Path):
    return pd.read_csv(p) if p.exists() and p.stat().st_size > 1 else None

# Prefer per-pair CSV; only fall back to global CSV if really needed
df_esmfold_analysis = _read_csv_or_none(PAIR_ANALYSIS / "df_esm.csv")
if df_esmfold_analysis is None:
    df_esmfold_analysis = _read_csv_or_none(GLOBAL_RES / "df_esmfold_analysis.csv")

df_esmfold = None
if df_esmfold_analysis is not None:
    # Your original filtering logic
    df_esmfold = df_esmfold_analysis[df_esmfold_analysis.get("fold_pair", PAIR_ID) == PAIR_ID]
    if "fold" in df_esmfold.columns:
        df_esmfold = df_esmfold[df_esmfold["fold"].astype(str).str.contains("ShallowMsa")]
    print(f"[run] ESMFold df ready: {len(df_esmfold)} rows")
    display(df_esmfold.head())
else:
    print("[skip] ESMFold section (no per-pair df_esm.csv and no global df_esmfold_analysis.csv)")
[run] ESMFold df ready: 660 rows
fold_pair model cluster_num name TMscore_fold1 TMscore_fold2 TMdiff
0 2mwfA_2nntA esm2 ShallowMsa_000 ShallowMsa_000__sample_001 0.050694 0.031461 0.019233
1 2mwfA_2nntA esm2 ShallowMsa_000 ShallowMsa_000__sample_002 0.055005 0.031751 0.023255
2 2mwfA_2nntA esm2 ShallowMsa_000 ShallowMsa_000__sample_003 0.054277 0.030320 0.023957
3 2mwfA_2nntA esm2 ShallowMsa_000 ShallowMsa_000__sample_004 0.055005 0.031751 0.023255
4 2mwfA_2nntA esm2 ShallowMsa_000 ShallowMsa_000__sample_005 0.045914 0.030729 0.015185

Best EsmFold prediction to fold 1¶

In [12]:
if df_esmfold is not None and not df_esmfold.empty:
    try:
        # keep your original selection logic here (fixed snippets below are placeholders)
        max_esm_pdb1 = df_esmfold.sort_values(by='TMscore_fold1', ascending=False).fold.iloc[0]
        score = df_esmfold.sort_values(by='TMscore_fold1', ascending=False).TMscore_fold1.iloc[0]
        plot_tool.align_and_visualize_pdb(pdb_path1, f'{plot_tool.folder}/{fold_pair}/output_esm_fold/{max_esm_pdb1}', score)
    except Exception as e:
        print(f"[skip] ESM fold1 align/plot: {e}")
        try:
            plot_tool.plot_fold_alignement_(pdb_path1, f'{plot_tool.folder}/{fold_pair}/output_esm_fold/{max_esm_pdb1}')
        except Exception as e2:
            print(f"[skip] ESM fold1 fallback visualization: {e2}")
else:
    print("[skip] ESM fold1: df_esmfold not available")
[skip] ESM fold1 align/plot: 'DataFrame' object has no attribute 'fold'
[skip] ESM fold1 fallback visualization: name 'max_esm_pdb1' is not defined

Best EsmFold prediction to fold 2¶

In [13]:
if df_esmfold is not None and not df_esmfold.empty:
    try:
        max_esm_pdb2 = df_esmfold.sort_values(by='TMscore_fold2', ascending=False).fold.iloc[0]
        score = df_esmfold.sort_values(by='TMscore_fold2', ascending=False).TMscore_fold2.iloc[0]
        plot_tool.align_and_visualize_pdb(pdb_path2, f'{plot_tool.folder}/{fold_pair}/output_esm_fold/{max_esm_pdb2}', score)
        print(score)
    except Exception as e:
        print(f"[skip] ESM fold2 align/plot: {e}")
        try:
            print(score)
            plot_tool.plot_fold_alignement_(pdb_path2, f'{plot_tool.folder}/{fold_pair}/output_esm_fold/{max_esm_pdb2}')
        except Exception as e2:
            print(f"[skip] ESM fold2 fallback visualization: {e2}")
else:
    print("[skip] ESM fold2: df_esmfold not available")
[skip] ESM fold2 align/plot: 'DataFrame' object has no attribute 'fold'
[skip] ESM fold2 fallback visualization: name 'score' is not defined
In [14]:
if df_esmfold is not None:
    df_esmfold
else:
    print("[skip] ESM df display (not available)")

Contact maps Msa Transformers¶

In [15]:
# Robust load of CMap CSV referenced by CMAP_RES_PATH (if defined)
df_cmap = None
try:
    CMAP_RES_PATH  # just to see if it exists
    df_cmap = load_csv_or_none(CMAP_RES_PATH)
except NameError:
    print("[skip] CMAP_RES_PATH is not defined in this notebook")

MAX_RECALL_1 = MAX_RECALL_2 = None
MAX_CLUSTER_RECALL_1 = MAX_CLUSTER_RECALL_2 = ''

if df_cmap is not None and not df_cmap.empty:
    df_cmap = df_cmap[df_cmap.fold_pair == fold_pair]
    try:
        MAX_RECALL_1 = df_cmap[df_cmap.recall_only_fold1 > df_cmap.recall_only_fold2] \
            .sort_values(by='recall_only_fold1', ascending=False).iloc[0]
        MAX_CLUSTER_RECALL_1 = MAX_RECALL_1.File[-7:-4]
    except Exception as e:
        print(f"[skip] MAX_RECALL_1: {e}")

    try:
        MAX_RECALL_2 = df_cmap[df_cmap.recall_only_fold1 < df_cmap.recall_only_fold2] \
            .sort_values(by='recall_only_fold2', ascending=False).iloc[0]
        MAX_CLUSTER_RECALL_2 = MAX_RECALL_2.File[-7:-4]
    except Exception as e:
        print(f"[skip] MAX_RECALL_2: {e}")
else:
    print("[skip] df_cmap not available")
[skip] missing CSV: /sci/labs/orzuk/orzuk/github/MsaCluster/Pipeline/data/df_cmap_all.csv
[skip] df_cmap not available
In [16]:
viz_folder = VIZ_FOLDER
if MAX_CLUSTER_RECALL_1 != '':
    file_max_recall_1 = f'{viz_folder}/msa_t__ShallowMsa_{MAX_CLUSTER_RECALL_1}_visualization_map_1_tol_0.npy'
else:
    file_max_recall_1 = ''

if MAX_CLUSTER_RECALL_2 != '':
    file_max_recall_2 = f'{viz_folder}/msa_t__ShallowMsa_{MAX_CLUSTER_RECALL_2}_visualization_map_2_tol_0.npy'
else:
    file_max_recall_2 = ''
In [ ]:
 
In [17]:
# build AF npy path explicitly and debug-print
af1_npy = (f"{VIZ_FOLDER}/msa_t__ShallowMsa_{MAX_AF_1}_visualization_map_1_tol_0.npy"
    if MAX_AF_1 else "")

print("VIZ_FOLDER:", VIZ_FOLDER, flush=True)
print("file_max_recall_1:", file_max_recall_1, "exists?", os.path.exists(file_max_recall_1), flush=True)
print("af1_npy:", af1_npy, "exists?", os.path.exists(af1_npy) if af1_npy else None, flush=True)

print("fold_pair:", fold_pair)
print("viz_folder:", viz_folder)
print("MAX_CLUSTER_RECALL_1:", MAX_CLUSTER_RECALL_1, "MAX_CLUSTER_RECALL_2:", MAX_CLUSTER_RECALL_2)


if df_cmap is not None and not df_cmap.empty:
    if file_max_recall_1 and af1_npy and os.path.exists(file_max_recall_1) and os.path.exists(af1_npy):
        af_cmap_recall = df_cmap[df_cmap.File.str.contains(MAX_AF_1)].recall_only_fold1.iloc[0]
        plot_viz_cmap2(file_max_recall_1, af1_npy,
                       legend_plot=f'Best Recall Fold 1 ({getattr(MAX_RECALL_1, "recall_only_fold1", "NA")}) '
                                   f'Vs Best AF 1 ({af_cmap_recall})')
    elif file_max_recall_1 and os.path.exists(file_max_recall_1):
        plot_viz_cmap(file_max_recall_1,
                      legend_plot=f'Best Recall Fold 1 ({getattr(MAX_RECALL_1, "recall_only_fold1", "NA")})')
    elif af1_npy and os.path.exists(af1_npy):
        af_cmap_recall = df_cmap[df_cmap.File.str.contains(MAX_AF_1)].recall_only_fold1.iloc[0]
        plot_viz_cmap(af1_npy, legend_plot=f'Best AF 1 ({af_cmap_recall})')
    else:
        print("[skip] fold1 viz: missing both maps", flush=True)
else:
    print("[skip] cmap fold1 viz: df_cmap not available", flush=True)
VIZ_FOLDER: /sci/labs/orzuk/orzuk/github/MsaCluster/Pipeline/2mwfA_2nntA/output_cmap_esm/VizCmaps
file_max_recall_1:  exists? False
af1_npy:  exists? None
fold_pair: 2mwfA_2nntA
viz_folder: /sci/labs/orzuk/orzuk/github/MsaCluster/Pipeline/2mwfA_2nntA/output_cmap_esm/VizCmaps
MAX_CLUSTER_RECALL_1:  MAX_CLUSTER_RECALL_2: 
[skip] cmap fold1 viz: df_cmap not available
In [18]:
af2_npy = (f"{VIZ_FOLDER}/msa_t__ShallowMsa_{MAX_AF_2}_visualization_map_2_tol_0.npy"
    if MAX_AF_2 else "")

print("VIZ_FOLDER:", VIZ_FOLDER, flush=True)
print("file_max_recall_2:", file_max_recall_2, "exists?", os.path.exists(file_max_recall_2), flush=True)
print("af2_npy:", af2_npy, "exists?", os.path.exists(af2_npy) if af2_npy else None, flush=True)

if df_cmap is not None and not df_cmap.empty:
    if file_max_recall_2 and af2_npy and os.path.exists(file_max_recall_2) and os.path.exists(af2_npy):
        af_cmap_recall = df_cmap[df_cmap.File.str.contains(MAX_AF_2)].recall_only_fold2.iloc[0]
        plot_viz_cmap2(file_max_recall_2, af2_npy,
                       legend_plot=f'Best Recall Fold 2 ({getattr(MAX_RECALL_2, "recall_only_fold2", "NA")}) '
                                   f'Vs Best AF 2 ({af_cmap_recall})')
    elif file_max_recall_2 and os.path.exists(file_max_recall_2):
        plot_viz_cmap(file_max_recall_2,
                      legend_plot=f'Best Recall Fold 2 ({getattr(MAX_RECALL_2, "recall_only_fold2", "NA")})')
    elif af2_npy and os.path.exists(af2_npy):
        af_cmap_recall = df_cmap[df_cmap.File.str.contains(MAX_AF_2)].recall_only_fold2.iloc[0]
        plot_viz_cmap(af2_npy, legend_plot=f'Best AF 2 ({af_cmap_recall})')
    else:
        print("[skip] fold2 viz: missing both maps", flush=True)
else:
    print("[skip] cmap fold2 viz: df_cmap not available", flush=True)
VIZ_FOLDER: /sci/labs/orzuk/orzuk/github/MsaCluster/Pipeline/2mwfA_2nntA/output_cmap_esm/VizCmaps
file_max_recall_2:  exists? False
af2_npy:  exists? None
[skip] cmap fold2 viz: df_cmap not available
In [ ]: